############################################
############################################
# previous: 2-VAP_data_justspp #############
############################################

###### Summary ############################################################################################
#This script pepares all files necessary for running full maxent models using all starting variables. For each species, it writes a .csv file 
#("swd" file) with columns showing values for each predictor variable extracted for each lat &long. It summarizes variables for each species as 
#boxplots. It then a combines data from differet species into modelling units. Most of these are the true listed single species but in some cases
#records from different species are combined - this is done only for closely related species where species form a phylogenetic unit, e.g. one 
#species used to be a subspecies of the other or is its (assumed or confirmed) sister taxon AND venoms are assumed to be similar AND
#1. one species is data deficient (<20 records) and therefore cannot reliably be used to create a separate model
#OR
#2. the taxonomy, species status and/or geographic delineation of lineages within the group is poorly resolved, creating uncertainty whether to 
#assign individual records to any one particular species within the group.
#For each modelling unit, the script creates a buffer the width of the unit's maximum range diameter (calculated based on occurrence records) 
#around known occurrences and creates a background file of all snake records from within the buffered area. This 'target' background is used as 
#a sample of all locations that could reasonably be reached by the species within its recent evolutionary history and is used as a contrast 
#between places where the species has been observed and places where people have looked for snakes but nt seen this particular species.
#Finally, the script creates a shell file that runs Maxent with pre-define dparamters for each modelling unit using the species swd & background 
#files. Models are run without extrapolation beyond input values for each variable and at 10-fold cross-validation using random 70% of unique 
#occurrence points to train the model and 30% to to set aside for model testing.
###########################################################################################################

############################################
# next: 4-VAP_VarSelect ####################
############################################

################################
######### 300h;  30GB ##########
################################


#load packages:
library(R.utils)
library(raster)
library(maptools)
library(rgdal)
library(stringr)
library(sf)
library(e1071)

# define location of maxent.jar file
maxent.jar <- paste("/home/jc217070/Maxent/maxent.jar",sep="")

#set project name:
projectName<-"WHOSnakes"

#define directories:
MyDir<-paste("/home/jc217070/Maxent",sep="")
LayersDir<-paste("/home/jc217070/Maxent/Layers")
OutDir<-paste(MyDir,"/Outputs1_Full",sep="")
SWDDir<-paste(MyDir, "/Maxent_SWDs/spp_swds",sep="")
PlotDir<-paste(MyDir,"/Maxent_Data/Plots/Plots_WHOSnakes_2020_09_01",sep="")

#create & define PBS directory for output files (errors and R outs)
dir.create(path=file.path(MyDir,"tmp.pbs"),showWarnings = FALSE)
PbsDir <- paste(MyDir, "/tmp.pbs", sep=""); setwd(PbsDir)

#read species reference list:
DatSum<-read.table(file=file.path(paste(MyDir, "/Maxent_LUTables/",projectName,".csv",sep="")), header=TRUE, sep=",") #load data summary for all species
#DatSum<-read.table(file=file.path(paste(MyDir, "/Maxent_LUTables/",projectName,"_Fin1.csv",sep="")), header=TRUE, sep=",") #load data summary for all species
spp<-DatSum$gen_sp_subtax

crit<-c("permutation.importance")   #c("permutation.importance","contribution","Test.gain.without","Test.gain.with.only","AUC.without","AUC.with.only")
critThresh<-c(1)                     #c(1,1,1,10,1,75)

#get asci names:
asclist<-list.files(path=paste(LayersDir), pattern=paste("\\.asc"),full.names=TRUE)
asclist<- asclist[order(asclist)]               #sort alphabetically
asclist<-gsub("\\.gz","",asclist)
asclist<-unique(asclist)

asclist<-asclist[1:length(asclist)] #sort order is different on HPC for some weird reason




snapraster<-raster(nrows=18000,ncols=36000,xmn=-180,xmx=180,ymn=-90,ymx=90)

#unzip & stack ascis
ascstack <- stack()
for (asc in asclist){
  if (file.exists(paste(asc,".gz",sep=""))){
  Sys.sleep(0.1)
  print(paste(Sys.time()," unzipping ",asc, sep=""))
  gunzip(filename=paste(asc,".gz",sep=""),
         destname=paste(asc),
         remove=TRUE, overwrite=TRUE)
  }}
  
for (asc in asclist){
  
  Sys.sleep(0.1)
  print(paste(Sys.time()," stacking ",asc, sep=""))
  layer<-raster(file.path(asc))
  layer
  
  # if(dim(layer)[1] != 18000){
  #   layer<-resample(layer,snapraster,method="ngb")
  #   writeRaster(layer, filename=paste(asc,sep=""), format="ascii", overwrite=TRUE)
  #   
  # }
  # 
  
  ascstack <- stack(ascstack, layer)
}

#define projection that I want to project rasters to
proj4_WGS84 <- CRS("+init=epsg:4326")
worldmap <- st_read(paste(PlotDir,"/worldmap/WHO_countries.shp",sep=""))
bg<-raster(paste(PlotDir,"/Cop_FAPAR_mean_2010-2019_WW1km.asc",sep=""), crs="+init=epsg:4326")
GLOBraster<-bg
values(GLOBraster)<-ifelse(values(GLOBraster)!=-Inf,0,-Inf) #create 'empty' raster of same extent/res as hillshade
GLOBrasterB<-GLOBraster
values(GLOBrasterB)<-ifelse(values(GLOBrasterB)==0,1,values(GLOBrasterB)) #create 'empty' raster of same extent/res as hillshade
snapraster<-raster(nrows=18000,ncols=36000,xmn=-180,xmx=180,ymn= -90,ymx= 90,crs=proj4_WGS84)

memory.limit(size=500000)

#########################
# make swds for species #
#########################

backgr<-data.frame()
l<- 0

sppB<-DatSum$gen_sp_subtax[DatSum$Redo==1]
#sppB<-c("Walterinnesia_aegyptia","Walterinnesia_morgani")

for (sp in spp){
  if (sp %in% sppB){
    if(file.exists(paste(MyDir, "/Maxent_SWDs/spp_occs/",sp,".csv",sep=""))){
      
  n<-which(spp==sp)
  sprecs<-read.table(file=paste(MyDir, "/Maxent_SWDs/spp_occs/",sp,".csv",sep=""), header=TRUE, sep=",") #load data summary for all species
  
  Sys.sleep(0.1)
  print(paste(Sys.time()," extracting occurrence data for ", sp,
              ", number of records= ",length(sprecs[,1]), sep=""))
  
  #extract values from ascstack
  coords<-data.frame(long=as.numeric(sprecs$long),lat=as.numeric(sprecs$lat))  # get coordinates of occurrence records
  spcoords <- SpatialPoints(coords[,c(1,2)])          # makes spatial version of occurrence coordinates
  vals<-extract(ascstack,spcoords)
  
  #for MaxEnt, first column defines taxon so all rows need to have the same name or it will interpret species and subspecies as separate taxa in models:
  sprecs[,1]<-paste(sp)
  colnames(sprecs)<-c("species","longitude","latitude")# change column names of record data frame

  sprecs<-cbind(sprecs, vals)                          # bind data to original species data frame
  #write full table so I can check manually for layers that result in NAs
  write.table(x=sprecs, file=paste(SWDDir, "/", sp,"withNAs.csv",sep=""),   
              na = "NA", row.names = FALSE, col.names = TRUE, sep=",")     #write .csv file of records associated with variable values (swd)

  sprecs<-na.omit(sprecs)                              # only keep records with data from all included layer (i.e. no "NA"s)
  
  DatSum$SPswdrecs[n]<-paste(length(sprecs[,1]))
  write.table(x=DatSum, file=paste(MyDir, "/Maxent_LUTables/",projectName,"_Fin2b.csv",sep=""),
              row.names = FALSE, col.names = TRUE, sep=",")
  
  
  if(length(sprecs[,1]>0)){
    
    Sys.sleep(0.1)
    print(paste(Sys.time()," binding data to venomous background and plotting summary stats for ", sp,
                ", number of records= ",length(sprecs[,1]), sep=""))
    
    write.table(x=sprecs, file=paste(SWDDir, "/", sp,".csv",sep=""),   
                na = "NA", row.names = FALSE, col.names = TRUE, sep=",")     #write .csv file of records associated with variable values (swd)
  
    #bind to background for correlation matrix
    
    if(n>1){ #if this is not the first species, read previously pre-saved background back in
      backgr<-read.table(file=paste(MyDir, "/Maxent_SWDs/backgrounds/",projectName,"_bgSWD_prelim.csv",sep=""), 
                         header=TRUE, sep=",") #load data summary for all species
      l<-length(backgr[,1])+length(sprecs[,1])
      Sys.sleep(0.1)
      print(paste(Sys.time()," binding data to background, length= ", l, sep=""))
      
    } else { #otherwise continue with initial placeholder background dataframe to bind new data to
    
    l<-l+length(sprecs[,1])
    Sys.sleep(0.1)
    print(paste(Sys.time()," binding data to background, length= ", l, sep=""))
    }
    
    backgr<-rbind(backgr,sprecs)
    write.table(x=backgr, file=paste(MyDir, "/Maxent_SWDs/backgrounds/",projectName,"_bgSWD_prelim.csv",sep=""),   
                na = "NA", row.names = FALSE, col.names = TRUE, sep=",")

    ################################################################
    #make boxplot of all data for each species to look for outliers#
    ################################################################
    
    n<-which(spp==sp)
    sprecs<-read.table(file=paste(MyDir, "/Maxent_SWDs/spp_swds/",sp,".csv",sep=""), header=TRUE, sep=",") #load data summary for all species
    
    vars<-names(sprecs)[4:length(sprecs)]
    colns<-ceiling(sqrt(length(vars)))
    
    ###############boxplots#######################
    
    png(filename=paste(SWDDir,"/",sp,"_VarHistBox.png",sep=""), units="in", width=40, height=20, res=500) #png smaller file size hat pdf
    par(mfrow = c(6,length(vars)), mar = c(1,1,1,1), oma=c(1,1,2,1), mgp=c(1.2,0.5,0))
    
    ###############boxplots#######################
    for (var in vars){
      v<-which(vars==var)
      varname<-gsub("wc2.0_bio_30s_","Bio",gsub("_WW1km","",gsub("_2010.2019FILL","",gsub("nn_","",
               gsub("ESA_HA_HL_","",gsub("_0.5cm_Med","",gsub("Amat2018_","",gsub("NEW_ESA_","",var))))))))
      myvar<-((sprecs[,v+3]-mean(sprecs[,v+3]))/(sd(sprecs[,v+3])+0.00001)) #standardize variable
      myvar<-myvar+(sqrt(min(myvar)^2)) #make positive
      if(!(is.na(min(myvar)))){
      boxplot(myvar, range=2, main=paste(varname)) #boxplot of un-transformed variable
      }else{
        plot(1~1, main=paste(varname), xlab=FALSE, ylab=FALSE, col="white", pch=1)
        text(0.7,0.7, labels=expression("NA"),pos=4,  col="black", cex=2)
      }
    }
    
    for (var in vars){
      print(paste(var))
      v<-which(vars==var)
      varname<-gsub("wc2.0_bio_30s_","Bio",gsub("_WW1km","",gsub("_2010.2019FILL","",gsub("nn_","",
               gsub("ESA_HA_HL_","",gsub("_0.5cm_Med","",gsub("Amat2018_","",gsub("NEW_ESA_","",var))))))))
      myvar<-((sprecs[,v+3]-mean(sprecs[,v+3]))/(sd(sprecs[,v+3])+0.00001)) #standardize variable
      myvar<-myvar+(sqrt(min(myvar)^2)) #make positive
      name<-"no transform"
      if(skewness(myvar)>0.5|is.na(skewness(myvar))){
        myvar<-log(myvar+1)
        name<-"log"
        if(skewness(myvar)>0.5 |is.na(skewness(myvar))){
          myvar<-log(myvar+1)
          name<-"loglog"
        }
      } else{
        if(skewness(myvar)< -0.5){
          myvar<-(myvar)^2
          name<-"sqrd"
          if(skewness(myvar)< -0.5){
            myvar<-(myvar)^2
            name<-"2sqrd"
          }
        }
      }
      if(!(is.na(min(myvar)))){
      boxplot(myvar, range=2, main=paste(name)) #boxplot of transformed variable
      }else{
        plot(1~1, main=paste(""), xlab=FALSE, ylab=FALSE, col="white", pch=1)
        text(0.7,0.7, labels=expression("NA"),pos=4,  col="black", cex=2)
      }
    }
    
    
    #############histograms##################
    
    if(length(sprecs[,1])>1){
      for (var in vars){ #plot raw histogram
        v<-which(vars==var)
        varname<-gsub("wc2.0_bio_30s_","Bio",gsub("_WW1km","",gsub("_2010.2019FILL","",gsub("nn_","",
                 gsub("ESA_HA_HL_","",gsub("_0.5cm_Med","",gsub("Amat2018_","",gsub("NEW_ESA_","",var))))))))
        myvar<-((sprecs[,v+3]-mean(sprecs[,v+3]))/(sd(sprecs[,v+3])+0.00001)) #standardize variable
        myvar<-myvar+(sqrt(min(myvar)^2)) #make positive
        if(!(is.na(min(myvar)))){
        hist(myvar, freq=FALSE, breaks=10, main=paste(varname))
        lines(x = density(myvar, adjust=2), col = "red",lty="dotted")
        legend(legend=paste("skew=",round(skewness(myvar),1)),x="topleft", cex=1, bty="n")
        }else{
          plot(1~1, main=paste(varname), xlab=FALSE, ylab=FALSE, col="white", pch=1)
          text(0.7,0.7, labels=expression("NA"),pos=4,  col="black", cex=2)
        }
      }
      
      for (var in vars){ #transform variable and plot updated histogram
        v<-which(vars==var)
        varname<-gsub("wc2.0_bio_30s_","Bio",gsub("_WW1km","",gsub("_2010.2019FILL","",gsub("nn_","",
                 gsub("ESA_HA_HL_","",gsub("_0.5cm_Med","",gsub("Amat2018_","",gsub("NEW_ESA_","",var))))))))
        myvar<-((sprecs[,v+3]-mean(sprecs[,v+3]))/(sd(sprecs[,v+3])+0.00001)) #standardize variable
        myvar<-myvar+(sqrt(min(myvar)^2)) #make positive
        name<-"no transform"
        if(skewness(myvar)>0.5|is.na(skewness(myvar))){
          myvar<-log(myvar+1)
          name<-"log"
          if(skewness(myvar)>0.5|is.na(skewness(myvar))){
            myvar<-log(myvar+1)
            name<-"loglog"
          }
        } else{
          if(skewness(myvar)< -0.5){
            myvar<-(myvar)^2
            name<-"sqrd"
            if(skewness(myvar)< -0.5){
              myvar<-(myvar)^2
              name<-"2sqrd"
            }
          }
        }
        if(!(is.na(min(myvar)))){
        hist(myvar, freq=FALSE, breaks=10, main=paste(name))
        lines(x = density(myvar, adjust=2), col = "red",lty="dotted")
        legend(legend=paste("skew=",round(skewness(myvar),1)),x="topleft", cex=1, bty="n")
        }else{
          plot(1~1, main=paste(""), xlab=FALSE, ylab=FALSE, col="white", pch=1)
          text(0.7,0.7, labels=expression("NA"),pos=4,  col="black", cex=2)
        }
      }
    }
    
    dev.off()
    
    
    ######################################
    ######################################
  
  }
}
}
}




#I used to use the following as a background but now I juts use it for the correlation matrix to look for collinearity
#the all snakes background is now used for the models bt first cut back to a buffer region around each species' occurrence records

# #remove duplicates from my species background & save
# backgr$lattrunc1km<-trunc(as.numeric(backgr$lat)*100)/100                 #truncates lat to 2 decimal points for 0.01 dec degree (~1km grid cell) analysis without (!) rounding
# backgr$longtrunc1km<-trunc(as.numeric(backgr$long)*100)/100               #as above, for longitude
# backgr<-backgr[!duplicated(backgr[,c("longtrunc1km", "lattrunc1km")]),]
# backgr$lattrunc1km<-NULL
# backgr$longtrunc1km<-NULL
# 
# #write background (just for stats) based on only my species:
# write.table(x=backgr, file=paste(MyDir, "/Maxent_SWDs/backgrounds/",projectName,"_bgSWD.csv",sep=""),   
#             na = "NA", row.names = FALSE, col.names = TRUE, sep=",")

########################################################################################
#### load swd file for all my species data points ######################################
#### and create correlation matrix of variables: #######################################

# Sys.sleep(0.1)
# print(paste(Sys.time(), "producing cor matrix of my species data", sep=" "))
# 
# allData<-read.table(paste(MyDir, "/Maxent_SWDs/backgrounds/",projectName,"_bgSWD.csv",sep=""),
#                     sep=",", header=TRUE)   #read in data file
# allData$species<-NULL
# myCor<-cor(allData)
# write.table(x=myCor, file=paste(LayersDir, "/bckgr_cor.csv",sep=""),   
#             na = "NA", row.names = TRUE, col.names = TRUE, sep=",")     #write .csv file of records associated with variable values (swd)
# 










######################################################
# prep shellfiles for Maxent (submit manually later) #

#read in full raw background lat/longs:
SPBG<-read.table(file=file.path(paste(MyDir, "/Maxent_SWDs/backgrounds/",projectName,"_bgSWD_raw.csv",sep="")), header=TRUE, sep=",") #load data summary for all species
SPBG$lattrunc1km<-trunc(as.numeric(SPBG$latitude)*100)/100                 #truncates lat to 2 decimal points for 0.01 dec degree (~1km grid cell) analysis without (!) rounding
SPBG$longtrunc1km<-trunc(as.numeric(SPBG$longitude)*100)/100               #as above, for longitude
SPBG<-SPBG[!duplicated(SPBG[,c("longtrunc1km", "lattrunc1km")]),]
SPBG$lattrunc1km<-NULL
SPBG$longtrunc1km<-NULL
coords<-data.frame(long=SPBG$longitude,lat=SPBG$latitude)  # get coordinates of occurrence records
SPBGcoords <- SpatialPoints(coords[,1:2],proj4string=CRS("+init=epsg:4326"))






#make text file with qsub commands list for all species' shell files
qsub.list.file <- '/home/jc217070/Maxent/R_scripts/qsubs-3-VAP.txt'



mapspp<-unique(DatSum$ModUnit)

####################
#mapsppB<-mapspp[1:3]
mapsppB<-unique(DatSum$ModUnit[DatSum$Redo==1])
#mapsppB<-c("Walterinnesia aegyptia","Walterinnesia morgani")
####################


for (mapsp in mapspp){
  if (mapsp %in% mapsppB){

    Sys.sleep(0.1)
    print(paste(Sys.time()," writing swd file for modelling unit ", mapsp, sep=""))
    
    ns<-which(DatSum$ModUnit==mapsp)
    m<-which(mapspp==mapsp)
    maprecs<-data.frame()
    
    ### make model unit swd from species swds ####
    for (sp in DatSum[ns,"gen_sp_subtax"]){
      i<-which(DatSum[ns,"gen_sp_subtax"]==sp)
      
      if (file.exists(paste(MyDir, "/Maxent_SWDs/spp_swds/",sp,".csv",sep=""))){
        addrecs<-read.table(paste(MyDir, "/Maxent_SWDs/spp_swds/",sp,".csv",sep=""), sep=",", header=TRUE)
        maprecs<-rbind(maprecs,addrecs)
        
      }}
    
    if(length(maprecs[1,])>=1){
      
      maprecs$species<-paste(gsub(" ","_",mapsp)) #maxent need the species column to be all the same name
      maprecs<-na.omit(maprecs)
      write.table(x=maprecs, file=paste(SWDDir, "/ModUnit_",gsub(" ","_",mapsp),".csv",sep=""),qmethod = "double",
                  na = "NA", row.names = FALSE, col.names = TRUE, sep=",",quote=TRUE)
      
      DatSum$MAPswdrecs[ns]<-paste(length(maprecs[,1]))
      write.table(x=DatSum, file=paste(MyDir, "/Maxent_LUTables/",projectName,"_Fin2b.csv",sep=""),
                  row.names = FALSE, col.names = TRUE, sep=",")
      
      if(length(maprecs[,1])>=5){
        
        ################################################
        # make buffer based background for each species#
        ################################################
        
        #buffer each mapspecies' recs
        Sys.sleep(0.1)
        print(paste(Sys.time()," making buffer backgrounds for modelling unit ", mapsp, sep=""))
        
        
        #determine the width of the species' range and make buffer size that width but stay within 1000 to 3000 km.
        #background needs to represent the habitat that the species could reasonably access but doesn't occupy.
        rangewidth<-max((max(na.omit(maprecs$long))-min(na.omit(maprecs$long))),(max(na.omit(maprecs$lat))-min(na.omit(maprecs$lat))))
        rangewidthM<-rangewidth*100000
        rangewidthM<-ifelse(rangewidthM>3000000,3000000,rangewidthM)
        rangewidthM<-ifelse(rangewidthM<1000000,1000000,rangewidthM)
        
        
        
        
        
          
        
        
        
        y<-rangewidthM
        
        #make extent object describing a box around occurrence records (margin= 4 times x and y extent of occurrence records):
        xrange<- (max(na.omit(maprecs$long)))-(min(na.omit(maprecs$long)))
        yrange<- (max(na.omit(maprecs$lat)))-(min(na.omit(maprecs$lat)))
        xcenter<- (min(na.omit(maprecs$long)))+(xrange/2)
        ycenter<- (min(na.omit(maprecs$lat)))+(yrange/2)
        rangemax<-max(xrange,yrange)
        xmin<-ifelse((xcenter-(rangemax/2)- y/100000) >= -180, (xcenter-(rangemax/2)- y/100000), -180)
        xmax<-ifelse((xcenter+(rangemax/2)+ y/100000) <=  180, (xcenter+(rangemax/2)+ y/100000),  180)
        ymin<-ifelse((ycenter-(rangemax/2)- y/100000) >=  -90, (ycenter-(rangemax/2)- y/100000),  -90)
        ymax<-ifelse((ycenter+(rangemax/2)+ y/100000) <=   90, (ycenter+(rangemax/2)+ y/100000),   90)
        myext<-extent(xmin,xmax,ymin, ymax)
        
        mysnapraster<-crop(snapraster,myext)
        
        coords<-data.frame(long=maprecs$longitude,lat=maprecs$latitude)  # get coordinates of occurrence records
        spcoords <- SpatialPoints(coords[,1:2],proj4string=CRS("+init=epsg:4326"))
        Buf<-buffer(spcoords, width=rangewidthM, dissolve=TRUE) #make buffer around current habitat
        
        print(paste(Sys.time()," 1 ", mapsp, sep=""))
        
        #Buf1<-rasterize(x=Buf,y=mysnapraster,as.numeric(1))
        #values(Buf1)<-ifelse(values(Buf1)==1,1,0)
        #Buf1<-merge(Buf1,GLOBraster)
                  
        #####
        #OutDirFin<-paste(MyDir,"/Outputs3_Final/",crit,"/ModUnit_", gsub(" ","_",mapsp),sep="")
        #dir.create(path=paste(OutDirFin, sep=""), showWarnings = FALSE, recursive=TRUE)
        #writeRaster(Buf1, filename=paste(OutDirFin, "/ModUnit_", gsub(" ","_",mapsp), "_OccBGBuf.asc",sep=""), format="ascii", overwrite=TRUE)
        #####
        
        
        
        
        
        
        
        
        
        
        SPBGcoords2<-st_as_sf(SPBGcoords)
        Buf2<-st_as_sf(Buf)
        
        #clip full raw background to mapsp occurrences buffered
        coords2<-st_filter(SPBGcoords2, Buf2)
        coords3<-as.data.frame(st_coordinates(coords2))
        names(coords3)<-c("long","lat")
        coords4<-data.frame(long=coords3$long,lat=coords3$lat)  # get coordinates of occurrence records
        
        thisSPBG <- SpatialPoints(coords4[,1:2],proj4string=CRS("+init=epsg:4326"))
        
        
        
        
        
        
        
        
        #clip full raw background to mapsp occurrecnes buffered
        
        
        
        #extract background predictor variable values:
        vals<-extract(ascstack,thisSPBG)
        
        firstcol<-rep(paste(gsub(" ","_",mapsp),"BG",sep="_"),length(thisSPBG))
        finalSPBG<-cbind(firstcol,coordinates(thisSPBG)[,1],coordinates(thisSPBG)[,2])
        colnames(finalSPBG)<-c("species","longitude","latitude")# change column names of record data frame
        finalSPBG<-cbind(finalSPBG, vals)                          # bind data to original species data frame
        finalSPBG<-na.omit(finalSPBG)                              # only keep records with data from all included layer (i.e. no "NA"s)
        
        write.table(x=finalSPBG, file=paste(MyDir, "/Maxent_SWDs/backgrounds/ModUnit_",gsub(" ","_",mapsp),"_BG.csv",sep=""),qmethod = "double",
                    na = "NA", row.names = FALSE, col.names = TRUE, sep=",",quote=TRUE)
        finalSPBG<-read.table(file=file.path(paste(MyDir, "/Maxent_SWDs/backgrounds/ModUnit_",gsub(" ","_",mapsp),"_BG.csv",sep="")), 
                              header=TRUE, sep=",")
        finalSPBG<-na.omit(finalSPBG)  #has to be redone because Nas show up in weird format (NaN)                            # only keep records with data from all included layer (i.e. no "NA"s)
        write.table(x=finalSPBG, file=paste(MyDir, "/Maxent_SWDs/backgrounds/ModUnit_",gsub(" ","_",mapsp),"_BG.csv",sep=""),qmethod = "double",
                    na = "NA", row.names = FALSE, col.names = TRUE, sep=",",quote=TRUE)
        
        
        #############################################
        #############################################
        
        
        Sys.sleep(0.1)
        print(paste(Sys.time()," writing shellfile for modelling unit ", mapsp, sep=""))
        
        #define background data file
        background.swd <- paste(MyDir, "/Maxent_SWDs/backgrounds/ModUnit_",gsub(" ","_",mapsp),"_BG.csv",sep="")
        
        #define the mapspecies swd file
        occurrences.swd <- paste(SWDDir, "/ModUnit_",gsub(" ","_",mapsp),".csv",sep="")
        
        #adjust runtime and memory to bg and sp file size (works well for <35 vars, <100,000 bg recs & <20,000 sp recs):
        sprecs<-read.table(paste(occurrences.swd), sep=",", header=TRUE)   #read in species file
        
        if(length(sprecs[,1])>=5){
          
          backgr<-read.table(paste(background.swd), sep=",", header=TRUE)   #read in background file
          
          vars<-names(sprecs)[4:length(sprecs)]
          
          lbg<-length(backgr[,1]) #how many background records?
          walltime<-round(lbg/1000) #define walltime in h depending on background size based on previous experience (plus a bit)
          memory<-round(ifelse(walltime/20>3,walltime/20,3)) #define max memory according to background size
          
          lsp<-length(sprecs[,1]) #how many species records?
          walltime<-round(walltime+((lsp/10000)*walltime)) #modify walltime: add more time on base time based on sp rec number
          memory<-round(memory+((lsp/10000)*memory))+1 #modify memory: add more memory onto base memory based on sp rec number
          
          walltime<-round(walltime*(0.035*length(vars)))
          memory<-round(ifelse(memory*(0.035*length(vars))>10,memory*(0.035*length(vars)),10))
          memory<-memory+10
          
          walltime<-ifelse(walltime>200,200,walltime) #upper limit for walltime... even my biggest job here only uses 100h
          walltime<-ifelse(walltime<30,30,walltime) 
          memory<-ifelse(memory>100,100,memory) #upper limit for memory... even my biggest job here only uses 50GB mem and is not slowed down (but over 300GB vmem...)
          memory<-ifelse(memory<70,70,memory) 
          
          
          
          
          
          #product feature decision
          
          if(rangewidthM < 1000000 & length(sprecs[,1]) <80){
            pfs<-paste("noproduct")
          } else {
            pfs<-paste("product")
          }
          
          
          
          
          # create a folder in the output directory for this species and define as species specific output directory:
          dir.create(path=paste(OutDir, "/ModUnit_", gsub(" ","_",mapsp), sep=""), showWarnings = FALSE)
          SpOutDir    <-paste(OutDir, "/ModUnit_", gsub(" ","_",mapsp), sep="")
          
          # create the shell file for each species model job & submit to HPC
          shell.file.name <- paste(PbsDir, "/ModUnit_", gsub(" ","_",mapsp), ".sh", sep="")
          
          shell.file <- file(shell.file.name, "w")
          
          #complusory bits for PBS
          cat('#!/bin/bash\n', file=shell.file)
          cat('#PBS -j oe\n', file=shell.file)                                                  # combine stdout and stderr into one file
          cat('#PBS -m ae\n', file=shell.file)                                                  # combine stdout and stderr into one file
          cat('#PBS -l walltime=',paste(walltime),':00:00\n', sep='', file=shell.file)          # define walltime hh:mm:ss
          cat('#PBS -l select=1:ncpus=1:mem=',paste(memory),'gb\n', sep='', file=shell.file)    # allocate memory
          
          cat('cd $PBS_O_WORKDIR\n', file=shell.file)
          cat('shopt -s expand_aliases\n', file=shell.file)
          cat('source /etc/profile.d/modules.sh\n', file=shell.file)   # need to access modules for java
          cat('echo "Job identifier is $PBS_JOBID"\n', file=shell.file)                      # need this to run R
          cat('echo "Working directory is $PBS_O_WORKDIR"\n', file=shell.file)                      # need this to run R
          cat('module load java\n', file=shell.file)                   # need to load java for maxent
          
          # run maxent
          
          #cross-validation on 10 replicates (this run is only for stats per species so only need the crossvalid/ non-projected outputs):
          cat('java -mx',paste(memory*1000),'m -XX:-UseGCOverheadLimit -jar ',maxent.jar, ' -s ',occurrences.swd, ' -e ',background.swd, ' -o ',SpOutDir, ' -t cat_ nowarnings novisible replicatetype=subsample randomtestpoints=30 replicates=10 nooutputgrids dontextrapolate noextrapolate nothreshold allowpartialdata ',pfs,' -P -J -r -a \n', sep='', file=shell.file)
          
          close(shell.file)
          
          write(paste("qsub ", shell.file.name, sep=""), file=qsub.list.file, append=T) 
          
          #shell.file.name <- paste(PbsDir, "/ModUnit_", gsub(" ","_",mapsp), ".sh", sep="")
          #system(paste("qsub -p 5 ", shell.file.name, sep="")) #set low priority so that I can still run other jobs while all these are running
          #Sys.sleep(5) # wait 5 sec between job submissions
          
  }
  }
  }
  }
}
########################################################################################

#cd /home/jc217070/Maxent/tmp.pbs
#bash /home/jc217070/Maxent/R_scripts/qsubs-3-VAP-redo.txt





############################################
############################################
# next: 4-VAP_VarSelect ####################
############################################
